/*
 * Big matrix multiplication module
 * */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include "base.h"
#include "factor.h"

/*
 * Prototypes for the two routines defined in this file:
 *   matrix_mult     - blocked multiplication driver, returns a float
 *   matrix_mult_4x4 - 4x4 block kernel; the trailing int is the row stride
 */
extern float matrix_mult(const int, struct FactorSet *, float *, float *, float *);
extern void matrix_mult_4x4(const float *, const float *, float *, int);

/*
 * @Function:		matrix_mult
 * @Input
 * 		N:			number of rows/columns of the square matrices; it is also
 * 					the row stride used to address A, B and T. Expected to be
 * 					a multiple of 4: only the leading 4*(N/4) rows/columns
 * 					are processed.
 * 		fs:			factor set; fs->t_comp is overwritten with the average
 * 					time spent per 4x4 block multiplication (us)
 * 		A, B:		input matrices, stored row by row with stride N
 * 		T:			output matrix, same layout. Block products are
 * 					accumulated into it (+=), so the caller must clear T
 * 					beforehand to obtain the plain product A x B.
 * @Return:			elapsed time of the whole multiplication (us)
 * @Description:	split the matrices into 4x4 blocks and, for every output
 * 					block (i, j), accumulate the products of the blocks
 * 					(i, k) of A and (k, j) of B using matrix_mult_4x4.
 * 					When N < 4 there is no complete block: nothing is
 * 					computed and fs->t_comp is left untouched.
 * */
float matrix_mult(const int N, struct FactorSet *fs, float *A, float *B, float *T)
{
	const int n = N/4; /* number of 4x4 blocks per dimension */
	float diff_time;
	struct timespec start, stop;

	clock_gettime(CLOCK_MONOTONIC, &start);
	if (n > 0) {
		/* size_t arithmetic keeps the element offsets from overflowing int */
		const size_t stride = (size_t)N;
		const size_t blocks = (size_t)n;
		size_t i, j, k;

		for (i = 0; i < blocks; i++) {
			/* first element of block row i in A */
			const float *row_a = A + 4*stride*i;

			for (j = 0; j < blocks; j++) {
				/* output block (i, j) does not depend on k */
				float *mt = T + 4*stride*i + 4*j;

				for (k = 0; k < blocks; k++) {
					matrix_mult_4x4(row_a + 4*k, B + 4*stride*k + 4*j, mt, N);
				}
			}
		}
	}
	clock_gettime(CLOCK_MONOTONIC, &stop);

	diff_time = calc_diff_time(&start, &stop)/1000; /* us */

	/*
	 * Average cost of one 4x4 block multiplication. The block count is
	 * formed in double so that n*n*n cannot overflow int, and the update is
	 * skipped when no block was computed to avoid dividing by zero.
	 */
	if (n > 0) {
		fs->t_comp = diff_time / (float)((double)n * n * n); /* us */
	}

	return diff_time;
}

/*
 * @Function:		matrix_mult_4x4
 * @Input
 * 		A, B:		top-left elements of the two 4x4 input blocks
 * 		T:			top-left element of the 4x4 output block
 * 		n:			row stride, i.e. distance in elements between two
 * 					consecutive rows of a block
 * @Return:			none
 * @Description:	multiply the 4x4 block at A by the 4x4 block at B and add
 * 					the product to the 4x4 block at T (T += A x B).
 * 					Element offsets inside a block:
 * 	-----------------------
 * 	| 0 |   1 |   2 |   3 |
 * 	| n | n+1 | n+2 | n+3 |
 * 	|2n |2n+1 |2n+2 |2n+3 |
 * 	|3n |3n+1 |3n+2 |3n+3 |
 * 	-----------------------
 * */
void matrix_mult_4x4(const float *A, const float *B, float *T, int n)
{
	int row, col;

	/* walk the output block row by row, left to right */
	for (row = 0; row < 4; row++) {
		const int base = row*n; /* offset of the first element of this row */

		for (col = 0; col < 4; col++) {
			/* row `row` of A times column `col` of B, added to T */
			T[base+col] += A[base]*B[col]
				+ A[base+1]*B[n+col]
				+ A[base+2]*B[2*n+col]
				+ A[base+3]*B[3*n+col];
		}
	}
}

